{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.5/dist-packages/h5py/__init__.py:36: FutureWarning: Conversion of the second argument of issubdtype from `float` to `np.floating` is deprecated. In future, it will be treated as `np.float64 == np.dtype(float).type`.\n",
      "  from ._conv import register_converters as _register_converters\n",
      "/usr/local/lib/python3.5/dist-packages/sklearn/cross_validation.py:41: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.\n",
      "  \"This module will be removed in 0.20.\", DeprecationWarning)\n"
     ]
    }
   ],
   "source": [
    "from utils import *\n",
    "import tensorflow as tf\n",
    "from collections import Counter\n",
    "from sklearn.cross_validation import train_test_split\n",
    "import time"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['negative', 'positive']\n",
      "10662\n",
      "10662\n"
     ]
    }
   ],
   "source": [
    "trainset = sklearn.datasets.load_files(container_path = 'data', encoding = 'UTF-8')\n",
    "trainset.data, trainset.target = separate_dataset(trainset,1.0)\n",
    "print(trainset.target_names)\n",
    "print(len(trainset.data))\n",
    "print(len(trainset.target))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Total words: 197234\n"
     ]
    }
   ],
   "source": [
    "texts = ' '.join(trainset.data)\n",
    "words = texts.split()\n",
    "word2freq = Counter(words)\n",
    "print(\"Total words:\", len(words))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Vocabulary size: 20465\n"
     ]
    }
   ],
   "source": [
    "_words = set(words)\n",
    "word2idx = {c: i for i, c in enumerate(_words)}\n",
    "idx2word = {i: c for i, c in enumerate(_words)}\n",
    "vocab_size = len(idx2word)\n",
    "indexed = [word2idx[w] for w in words]\n",
    "print('Vocabulary size:', vocab_size)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "class SKIPGRAM:\n",
    "    def __init__(self, sample_size, vocab_size, embedded_size, window_size=3):\n",
    "        self.X = tf.placeholder(tf.int32, shape=[None])\n",
    "        self.Y = tf.placeholder(tf.int32, shape=[None, 1])\n",
    "        self.embedding = tf.Variable(tf.truncated_normal([vocab_size, embedded_size],\n",
    "                                                      stddev=1.0 / np.sqrt(embedded_size)))\n",
    "        self.bias = tf.Variable(tf.zeros([vocab_size]))\n",
    "        embedded = tf.nn.embedding_lookup(self.embedding, self.X)\n",
    "        self.cost = tf.reduce_mean(tf.nn.sampled_softmax_loss(\n",
    "            weights=self.embedding,\n",
    "            biases=self.bias,\n",
    "            labels=self.Y,\n",
    "            inputs=embedded,\n",
    "            num_sampled=sample_size,\n",
    "            num_classes=vocab_size))\n",
    "        self.optimizer = tf.train.AdamOptimizer().minimize(self.cost)\n",
    "        self.valid_dataset = tf.placeholder(tf.int32, shape=[None])\n",
    "        norm = tf.sqrt(tf.reduce_sum(tf.square(self.embedding), 1, keep_dims=True))\n",
    "        normalized_embeddings = self.embedding / norm\n",
    "        valid_embeddings = tf.nn.embedding_lookup(normalized_embeddings, self.valid_dataset)\n",
    "        self.similarity = tf.matmul(valid_embeddings, normalized_embeddings, transpose_b=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "batch_size = 128\n",
    "embedded_size = 128\n",
    "skip_window = 5\n",
    "epoch = 10\n",
    "valid_size = 10\n",
    "nearest_neighbors = 8"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "tf.reset_default_graph()\n",
    "sess = tf.InteractiveSession()\n",
    "model = SKIPGRAM(batch_size,vocab_size,embedded_size)\n",
    "sess.run(tf.global_variables_initializer())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "def get_y(words, idx):\n",
    "    skipped = np.random.randint(1, skip_window+1)\n",
    "    left = idx - skip_window if (idx - skipped) > 0 else 0\n",
    "    right = idx + skipped\n",
    "    y = words[left: idx] + words[idx+1: right+1]\n",
    "    return list(set(y))\n",
    "\n",
    "def make_xy(int_words):\n",
    "    x,y = [], []\n",
    "    for i in range(0, len(int_words)):\n",
    "        input_w = int_words[i]\n",
    "        labels = get_y(int_words, i)\n",
    "        x.extend([input_w] * len(labels))\n",
    "        y.extend(labels)\n",
    "    return np.array(x), np.array(y)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "X, Y = make_xy(indexed)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 1, avg loss 2.611017\n",
      "Nearest to um: que, filme, para, em, una, y, mas, tempo,\n",
      "Nearest to of: kind, worst, life, the, most, films, one, sense,\n",
      "Nearest to the: of, film, movie, all, has, worst, end, problem,\n",
      "Nearest to budding: blessedly, tangle, barbs, comforted, loversontherun, musclefest, hankies, engulfing,\n",
      "Nearest to revolt: adoration, resent, girlwoman, mucking, incarnates, bai, rara, highprofile,\n",
      "Nearest to gellar: interpret, overdue, accident, confining, diego, wordplay, toolong, skatesurf,\n",
      "Nearest to taps: readers, horroraction, nuances, woven, exceeds, sarcastic, oprahfication, beating,\n",
      "Nearest to seeing: worth, come, talking, first, other, ever, blade, once,\n",
      "Nearest to the: of, film, movie, all, has, worst, end, problem,\n",
      "Nearest to questions: extremely, share, guys, events, verve, both, pretty, fine,\n",
      "epoch 2, avg loss 2.186510\n",
      "Nearest to enough: just, sustain, really, quite, isnt, fails, thin, hard,\n",
      "Nearest to theres: here, nothing, lot, no, much, reason, certain, plenty,\n",
      "Nearest to adapted: apted, powerful, nervewracking, thornberrys, strays, from, admittedly, stone,\n",
      "Nearest to in: title, way, right, which, interest, exercise, favor, every,\n",
      "Nearest to the: end, worst, plot, film, same, entire, viewer, all,\n",
      "Nearest to new: york, love, way, year, ever, territory, best, someone,\n",
      "Nearest to you: can, know, think, believe, unless, shake, laugh, dont,\n",
      "Nearest to ghastly: angela, blacklight, qual, nurtures, filter, unleashed, turmoil, embellishment,\n",
      "Nearest to put: together, get, happens, tell, they, believe, thin, themselves,\n",
      "Nearest to you: can, know, think, believe, unless, shake, laugh, dont,\n",
      "epoch 3, avg loss 2.020266\n",
      "Nearest to choice: novels, authority, wake, contorting, yiddish, tradition, diminishing, endorsement,\n",
      "Nearest to the: end, worst, same, effect, original, entire, rest, viewer,\n",
      "Nearest to as: well, neither, shallow, pathetic, near, possible, rating, serves,\n",
      "Nearest to suddenly: reflect, ireland, closest, steering, growth, widow, quietness, chopsocky,\n",
      "Nearest to but: quite, also, pretty, nothing, exactly, anything, maybe, still,\n",
      "Nearest to cheerful: remainnameless, macbeth, amc, imminently, bangup, recommending, chuckling, jokers,\n",
      "Nearest to kids: spy, adults, 2, parents, then, depressing, audiences, stay,\n",
      "Nearest to to: tries, fails, needs, try, hard, trying, wanted, tell,\n",
      "Nearest to west: community, upper, class, middle, modern, weight, water, traveler,\n",
      "Nearest to make: tell, go, they, already, wanted, probably, certainly, laugh,\n",
      "epoch 4, avg loss 1.894947\n",
      "Nearest to is: there, problem, joke, thing, result, disaster, isnt, really,\n",
      "Nearest to sensuous: goodnaturedly, sinner, exigencies, bleakly, littleremembered, anomaly, conspiratorial, hews,\n",
      "Nearest to also: but, taken, storytelling, ultimately, wellmeaning, somewhat, fails, genre,\n",
      "Nearest to spots: dogmalike, noses, stultifying, chins, motocross, veins, spyontherun, denouements,\n",
      "Nearest to in: exercise, place, interested, lesson, terms, execution, right, lost,\n",
      "Nearest to here: there, exactly, did, theres, nothing, why, theyre, interesting,\n",
      "Nearest to tale: cautionary, fairy, morality, comingofage, thoughtful, weightless, tragic, americans,\n",
      "Nearest to a: lot, dreary, weak, little, twist, joke, decent, bit,\n",
      "Nearest to be: remembered, ought, disappointed, supposed, used, appreciated, 1970s, must,\n",
      "Nearest to disturbing: dark, study, identity, sexual, uses, dramatically, surprising, deeply,\n",
      "epoch 5, avg loss 1.781454\n",
      "Nearest to it: goes, thinks, probably, call, seeming, wanted, would, shame,\n",
      "Nearest to but: wasnt, certainly, barbershop, also, serviceable, taken, premise, scattered,\n",
      "Nearest to fun: having, exhilarating, appeal, share, popcorn, slowly, bmovie, good,\n",
      "Nearest to engrossing: portrait, resonant, irresistible, spite, uncompromising, sweetly, affecting, compassionate,\n",
      "Nearest to than: more, less, significantly, smarter, trifle, easier, worse, rather,\n",
      "Nearest to bigscreen: pokemania, rash, overlong, mornings, gimmicky, unprecedented, blowout, thrusts,\n",
      "Nearest to we: ourselves, need, thinking, get, havent, count, know, demand,\n",
      "Nearest to therapy: todays, childish, humdrum, viewpoints, session, moodaltering, mumbojumbo, sessions,\n",
      "Nearest to questioning: ensnaring, election, paintings, payami, openly, oddities, cooks, warp,\n",
      "Nearest to may: admirers, existed, cooperative, presentation, walter, sound, appear, fondly,\n",
      "epoch 6, avg loss 1.682141\n",
      "Nearest to vintage: sip, thirdbest, shirley, wines, tearstained, rival, pulled, mandy,\n",
      "Nearest to expect: youd, thinking, swear, could, say, notatallgood, further, wrong,\n",
      "Nearest to off: hook, kicks, doze, pulls, starts, comes, promisingly, shelves,\n",
      "Nearest to less: than, significantly, worse, depressing, expected, mature, more, predecessor,\n",
      "Nearest to those: basedontruth, baseball, sequels, detriment, susceptible, reaching, digit, groups,\n",
      "Nearest to potential: touched, madefortv, displays, simbolizando, angel, demographic, core, penas,\n",
      "Nearest to a: sketch, rumor, lot, disguised, bit, single, glossy, childrens,\n",
      "Nearest to workplace: rivalry, interfamily, ambition133, horses, eyeballtoeyeball, frisky, wewannour, twinklyeyed,\n",
      "Nearest to a: sketch, rumor, lot, disguised, bit, single, glossy, childrens,\n",
      "Nearest to story: inexorably, construct, tell, telling, leavitt, bogs, spread, elements,\n",
      "epoch 7, avg loss 1.605133\n",
      "Nearest to does: fundamentals, job, romethings, matineestyle, milk, michell, deliver, catch,\n",
      "Nearest to fresh: breath, substantial, reconstruction, sophistication, bring, depth, satisfyingly, formula,\n",
      "Nearest to it: wasnt, hate, thinks, goes, happens, indistinct, shame, wanted,\n",
      "Nearest to body: smacks, leniency, fireball, fixating, humans, some, flashbacks, annoying,\n",
      "Nearest to in: thrown, opinion, exercise, bogs, hole, lesson, terms, execution,\n",
      "Nearest to sugar: brown, playstation, calculations, famuyiwas, hysteria, rick, rescue, admirably,\n",
      "Nearest to about: worrying, mary, schmidt, care, say, something, talking, nafs,\n",
      "Nearest to 40s: 30s, dualistic, crux, bias, demi, clearcut, northwest, selfcentered,\n",
      "Nearest to takes: copflick, slowmoving, laziness, inexplicable, approach, slow, grips, hatinhand,\n",
      "Nearest to their: orphans, dismember, estranged, earn, employ, watches, repelled, peoples,\n",
      "epoch 8, avg loss 1.538825\n",
      "Nearest to legitimate: stringpulling, sams, maggio, clyde, excepting, preparation, unpleasantly, convolutions,\n",
      "Nearest to of: fleetingly, devoid, requires, toy, favor, excruciating, pitfalls, reign,\n",
      "Nearest to strong: wilson, imbued, thanks, weak, erotic, monarch, bought, washingtons,\n",
      "Nearest to finish: start, boilerplate, unsurprisingly, discouraging, fart, burlap, forgettably, rapturous,\n",
      "Nearest to red: dragon, bridge, pen, sharpie, noses, meal, symbolically, rattling,\n",
      "Nearest to and: contrived, indifferent, exposition, stilted, believability, clumsy, wholly, loathsome,\n",
      "Nearest to people: jungle, needing, who, hunger, publicists, surround, tonga, connect,\n",
      "Nearest to to: decide, fails, tries, tell, according, lend, fleshedout, capitalize,\n",
      "Nearest to if: youre, coke, youve, were, schwentke, tubaplaying, understand, werent,\n",
      "Nearest to the: payoff, transporter, worst, entire, authenticity, order, effect, problem,\n",
      "epoch 9, avg loss 1.495071\n",
      "Nearest to many: herrings, presuppose, shallower, triedandtrue, merchandisedtothemax, fide, selfdestructiveness, contradiction,\n",
      "Nearest to unsettling: prognosis, recreates, miyazakis, cooly, namely, teeming, foreboding, selfdestructive,\n",
      "Nearest to of: crackle, reign, toy, pitfalls, favor, requires, fleetingly, arrangements,\n",
      "Nearest to universal: studios, concerns, heated, relays, passions, spirituality, chronicles, subtly,\n",
      "Nearest to forgettable: instantly, meyjess, snowandstuntwork, goodhearted, benign, cheerful, fluffy, bessons,\n",
      "Nearest to taste: thingtype, acquired, candylike, swamp, tanks, coke, exaggeration, fouryearold,\n",
      "Nearest to air: sleaziness, conditioning, desolate, breath, desiccated, selfreferential, urbanity, leaks,\n",
      "Nearest to any: awards, interchangeable, lectures, sacrificing, urine, farts, recreational, pops,\n",
      "Nearest to or: conceptions, almodovar, disgust, either, anatomical, remotely, grandkids, unequivocally,\n",
      "Nearest to of: crackle, reign, toy, pitfalls, favor, requires, fleetingly, arrangements,\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "epoch 10, avg loss 1.450750\n",
      "Nearest to leguizamo: plugged, shouted, elie, thank, twohour, jones, longer, chouraqui,\n",
      "Nearest to it: squanders, thinks, indistinct, plods, 1970s, recommendation, curling, call,\n",
      "Nearest to life: imitating, reaffirms, breathe, aquatic, ravages, romanticized, rural, squeezed,\n",
      "Nearest to space: station, frontier, sterility, breathtakingly, killers, fantasized, unleashes, outerspace,\n",
      "Nearest to a: sketch, doodle, excuse, glossy, mishmash, paintbynumbers, letdown, amber,\n",
      "Nearest to them: freeing, artefact, riddles, redeemable, personally, awake, cqs, enable,\n",
      "Nearest to much: syrup, norma, grueling, zap, asking, distill, yuks, mortons,\n",
      "Nearest to which: broomsticks, shortest, romijnstamos, developmentally, amuses, farmers, unconscious, rebecca,\n",
      "Nearest to ribald: fullthroated, gob, plight, professionalism, development, arrested, trailers, humor,\n",
      "Nearest to seen: havent, youve, weve, 000, already, loser, debated, starter,\n"
     ]
    }
   ],
   "source": [
    "for i in range(epoch):\n",
    "    total_cost = 0\n",
    "    for k in range(0,(X.shape[0] // batch_size) * batch_size,batch_size):\n",
    "        batch_x = X[k:k+batch_size]\n",
    "        batch_y = Y[k:k+batch_size,np.newaxis]\n",
    "        cost,_ = sess.run([model.cost,model.optimizer],feed_dict={model.X:batch_x,\n",
    "                                                                 model.Y:batch_y})\n",
    "        total_cost += cost\n",
    "    total_cost /= (X.shape[0] // batch_size)\n",
    "    print('epoch %d, avg loss %f'%(i+1,total_cost))\n",
    "    random_valid_size = np.random.choice(indexed, valid_size)\n",
    "    similarity = sess.run(model.similarity,feed_dict={model.valid_dataset:random_valid_size})\n",
    "    for no, i in enumerate(random_valid_size):\n",
    "        valid_word = idx2word[i]\n",
    "        nearest = (-similarity[no, :]).argsort()[1:nearest_neighbors + 1]\n",
    "        log_str = 'Nearest to %s:' % valid_word\n",
    "        for k in range(nearest_neighbors):\n",
    "            close_word = idx2word[nearest[k]]\n",
    "            log_str = '%s %s,' % (log_str, close_word)\n",
    "        print(log_str)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.2"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
